library(Matrix)
library(Matrix.utils)
library(scran)
library(SingleCellExperiment)
library(tidyverse)
library(vroom)
library(caret)
library(CelliD)
library(corrplot)
# Load subsampled fetal cell atlas data
fca_counts <- readRDS("/project2/gilad/jpopp/ebQTL/data/fca/counts.sampled.rds")
fca_cells <- readRDS("/project2/gilad/jpopp/ebQTL/data/fca/cell_metadata.rds")
fca_genes <- readRDS("/project2/gilad/jpopp/ebQTL/data/fca/gene_metadata.rds") %>%
mutate_all(as.character)
# Subset cells to those in the subsampled data
fca_cells <- fca_cells %>%
filter(sample %in% colnames(fca_counts))
# Subset genes to protein-coding genes without duplicated HGNC entries
pull_gene_type <- function(attr) {
str_split(attr, "\"")[[1]][6]
}
pull_gene_name <- function(attr) {
str_split(attr, "\"")[[1]][8]
}
pull_gene_id <- function(attr) {
str_split(attr, "\"")[[1]][2]
}
gencode <- vroom("/project2/gilad/kenneth/References/human/cellranger/cellranger4.0/refdata-gex-GRCh38-2020-A/genes/genes.gtf",
col_names=c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"), skip=5) %>%
filter(seqname %in% paste0("chr", seq(1, 22))) %>%
filter(feature == "gene") %>%
mutate(type=map_chr(attribute, pull_gene_type)) %>%
mutate(ensg=map_chr(attribute, pull_gene_id)) %>%
mutate(hgnc=map_chr(attribute, pull_gene_name)) %>%
filter(type=="protein_coding")
Rows: 2765969 Columns: 9
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): seqname, source, feature, score, strand, frame, attribute
dbl (2): start, end
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fca_genes <- mutate(fca_genes, gene_id_short=str_extract(gene_id, "[^.]+"))
fca_pc_genes <- filter(fca_genes,
(gene_short_name %in% gencode$hgnc) & (gene_id_short %in% gencode$ensg))
hgnc_duplicates <- fca_pc_genes %>%
group_by(gene_short_name) %>%
count() %>%
filter(n > 1) %>%
pull(gene_short_name)
fca_pc_genes <- fca_pc_genes %>%
filter(!gene_short_name %in% hgnc_duplicates)
# Make the counts matrix match the metadata
fca_counts <- fca_counts[fca_pc_genes$gene_id,fca_cells$sample]
rownames(fca_counts) <- fca_pc_genes$gene_short_name
# Create SingleCellExperiment object
fca <- SingleCellExperiment(list(counts=fca_counts),
colData=fca_cells)
saveRDS(fca, "/project2/gilad/jpopp/ebQTL/data/fca/counts.sampled.sce")
Now, just as the original paper subsampled each (organ, cell type) to have at most 5K cells, we want to do the same with the major cell types (not organ). Otherwise cell types like vascular endothelial cells which were collected from multiple organs are way over-represented.
First, however, I’m going to remove placental cells and uncharacterized cell types which were very similar to a characterized cell type, or were likely contamination
fca$celltype <- str_extract(fca$Organ_cell_lineage, "[^-]+$")
omitted_types <- c("AFP_ALB positive cells",
"CCL19_CCL21 positive cells",
"CLC_IL5RA positive cells",
"CSH1_CSH2 positive cells",
"ELF3_AGBL2 positive cells",
"MUC13_DMBT1 positive cells",
"PDE11A_FAM19A2 positive cells",
"PDE1C_ACSM3 positive cells",
"SATB2_LRRC7 positive cells",
"SKOR2_NPSR1 positive cells",
"SLC24A4_PEX5L positive cells",
"SLC26A4_PAEP positive cells")
celltype_sampler <- as_tibble(colData(fca)) %>%
select(sample, Organ, celltype) %>%
mutate(celltype_revised=celltype) %>%
mutate(celltype_revised=if_else(Organ=="Placenta", true=NA_character_, false=celltype_revised)) %>%
mutate(celltype_revised=if_else(celltype %in% omitted_types, true=NA_character_, false=celltype_revised)) %>%
drop_na() %>%
mutate(celltype=celltype_revised, .keep="unused") %>%
group_by(celltype) %>%
add_count(celltype)
Now, since we’re training a classifier, I am going to remove cell types with fewer than 500 cells which may not offer enough information to obtain reliable expression signatures.
celltype_sampler %>%
filter(n < 500) %>%
select(celltype) %>%
unique()
celltype_sampler <- celltype_sampler %>%
filter(n >= 500)
With these rarer cell types removed, we’ll subsample the more common cell types to 5K cells
keepers_rare <- celltype_sampler %>%
filter(n <= 5000) %>%
pull(sample)
keepers_sampled <- celltype_sampler %>%
filter(n > 5000) %>%
slice_sample(n=5000) %>%
pull(sample)
fca <- fca[,c(keepers_rare, keepers_sampled)]
Now, I’ll perform scran normalization and dimensionality reduction
libs <- librarySizeFactors(fca)
sizeFactors(fca) <- libs
fca <- logNormCounts(fca)
fca.dec <- modelGeneVar(fca)
hvg.fca <- getTopHVGs(fca.dec, fdr.threshold=0.1)
fca.pca <- prcomp(t(logcounts(fca)[hvg.fca,]), rank=50)
Warning in asMethod(object) :
sparse->dense coercion: allocating vector of size 1.2 GiB
reducedDims(fca) <- list(PCA=fca.pca$x)
fca <- RunMCA(fca, nmcs=50, features=hvg.fca)
Warning in asMethod(object) :
sparse->dense coercion: allocating vector of size 29.1 GiB
Computing Fuzzy Matrix
7.782 sec elapsed
Computing SVD
145.191 sec elapsed
Computing Coordinates
1.722 sec elapsed
saveRDS(fca, "/project2/gilad/jpopp/ebQTL/data/fca/counts.sampled.mca_pca_fromhvgs.sce")
This gives us a list of 56 major cell types, are those appropriate to use for a new classifier?
To look into this, I’m going to test the classifier by trying to annotate the original dataset
fca_signatures <- GetGroupGeneSet(fca, group.by="celltype", n.features=100)
creating ranking
| | 0 % ~calculating
|+ | 2 % ~00s
|++ | 4 % ~00s
|+++ | 5 % ~00s
|++++ | 7 % ~00s
|+++++ | 9 % ~00s
|++++++ | 11% ~00s
|+++++++ | 12% ~00s
|++++++++ | 14% ~00s
|+++++++++ | 16% ~00s
|+++++++++ | 18% ~00s
|++++++++++ | 20% ~00s
|+++++++++++ | 21% ~00s
|++++++++++++ | 23% ~00s
|+++++++++++++ | 25% ~00s
|++++++++++++++ | 27% ~00s
|+++++++++++++++ | 29% ~00s
|++++++++++++++++ | 30% ~00s
|+++++++++++++++++ | 32% ~00s
|+++++++++++++++++ | 34% ~00s
|++++++++++++++++++ | 36% ~00s
|+++++++++++++++++++ | 38% ~00s
|++++++++++++++++++++ | 39% ~00s
|+++++++++++++++++++++ | 41% ~00s
|++++++++++++++++++++++ | 43% ~00s
|+++++++++++++++++++++++ | 45% ~00s
|++++++++++++++++++++++++ | 46% ~00s
|+++++++++++++++++++++++++ | 48% ~00s
|+++++++++++++++++++++++++ | 50% ~00s
|++++++++++++++++++++++++++ | 52% ~00s
|+++++++++++++++++++++++++++ | 54% ~00s
|++++++++++++++++++++++++++++ | 55% ~00s
|+++++++++++++++++++++++++++++ | 57% ~00s
|++++++++++++++++++++++++++++++ | 59% ~00s
|+++++++++++++++++++++++++++++++ | 61% ~00s
|++++++++++++++++++++++++++++++++ | 62% ~00s
|+++++++++++++++++++++++++++++++++ | 64% ~00s
|++++++++++++++++++++++++++++++++++ | 66% ~00s
|++++++++++++++++++++++++++++++++++ | 68% ~00s
|+++++++++++++++++++++++++++++++++++ | 70% ~00s
|++++++++++++++++++++++++++++++++++++ | 71% ~00s
|+++++++++++++++++++++++++++++++++++++ | 73% ~00s
|++++++++++++++++++++++++++++++++++++++ | 75% ~00s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~00s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~00s
|+++++++++++++++++++++++++++++++++++++++++ | 80% ~00s
|++++++++++++++++++++++++++++++++++++++++++ | 82% ~00s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~00s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~00s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 98% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s
cellid_enrichments <- RunCellHGT(fca, pathways = fca_signatures, features=hvg.fca, dims = 1:50, n.features=100)
Error in checkCelliDArg.SingleCellExperiment(X = X, reduction = reduction, :
object 'hvg.fca' not found
This is pretty good performance, although some cell types are driving confusion.
To understand which cell types are most similar or distinct, I’m going to look into the overlap between signature gene sets.
celltypes <- unique(names(fca_signatures))
ct_dists <- matrix(nrow=length(celltypes), ncol=length(celltypes))
rownames(ct_dists) <- celltypes
colnames(ct_dists) <- celltypes
for (ct1 in seq_len(length(celltypes))) {
for (ct2 in seq_len(length(celltypes))) {
ct_dists[ct1, ct2] = 100 - length(intersect(fca_signatures[[ct1]], fca_signatures[[ct2]]))
}
}
hc <- hclust(as.dist(ct_dists))
plot(hc, cex=0.8)
This calls for another round of merging cells based on function and gene set similarity
First, I’ll consolidate the nervous system cells into a few groups: CNS neurons, CNS glia, PNS neurons, PNS glia, eye, and neuroendocrine
cns_neuronal_types <- c("Limbic system neurons", "Inhibitory interneurons",
"Inhibitory neurons", "Excitatory neurons",
"Unipolar brush cells", "Granule neurons",
"Purkinje neurons")
cns_glial_types <- c("Microglia", "Astrocytes", "Oligodendrocytes")
pns_neuronal_types <- c("Visceral neurons", "ENS neurons")
pns_glial_types <- c("ENS glia", "Satellite cells", "Schwann cells")
eye_types <- c("Retinal progenitors and Muller glia", "Amacrine cells",
"Bipolar cells", "Ganglion cells", "Retinal pigment cells",
"Horizontal cells", "Photoreceptor cells")
neuroendocrine_types <- c("Chromaffin cells", "Islet endocrine cells",
"Neuroendocrine cells", "Sympathoblasts")
celltype_sampler <- celltype_sampler %>%
mutate(celltype=if_else(celltype %in% cns_neuronal_types, true="CNS neurons", false=celltype)) %>%
mutate(celltype=if_else(celltype %in% cns_glial_types, true="CNS glia", false=celltype)) %>%
mutate(celltype=if_else(celltype %in% pns_neuronal_types, true="PNS neurons", false=celltype)) %>%
mutate(celltype=if_else(celltype %in% pns_glial_types, true="PNS glia", false=celltype)) %>%
mutate(celltype=if_else(celltype %in% eye_types, true="Eye cells", false=celltype)) %>%
mutate(celltype=if_else(celltype %in% neuroendocrine_types, true="Neuroendocrine cells", false=celltype))
Now, for immune cells. I’m going to remove both myeloid cells and hematopoietic stem/ progenitor cells since they’re ancestors of other cell types that are present here. I’ll also lump thymocytes under lymphoid cells since they’re functionally very similar.
celltype_sampler <- celltype_sampler %>%
mutate(celltype=if_else(celltype %in% c("Myeloid cells", "Hematopoietic stem cells"), true=NA_character_, false=celltype)) %>%
mutate(celltype=if_else(celltype == "Thymocytes", true="Lymphoid cells", false=celltype)) %>%
drop_na()
Now, I need to update the dataset to contain even samples from these groups
celltype_subsetter <- celltype_sampler %>%
select(-n) %>%
group_by(celltype) %>%
add_count(celltype)
keepers_rare <- celltype_subsetter %>%
filter(n <= 5000) %>%
pull(sample)
keepers_sampled <- celltype_subsetter %>%
filter(n > 5000) %>%
slice_sample(n=5000) %>%
pull(sample)
celltype_subsetter <- celltype_subsetter %>%
filter(sample %in% c(keepers_rare, keepers_sampled))
Now, I’ll do the same thing to validate a classifier based on these revised cell types
fca_subsampled <- fca[,celltype_subsetter$sample]
fca_subsampled$celltype <- celltype_subsetter$celltype
fca_subsampled.dec <- modelGeneVar(fca_subsampled)
hvg.fca_subsampled <- getTopHVGs(fca_subsampled.dec, fdr.threshold=0.1)
fca_subsampled.pca <- prcomp(t(logcounts(fca_subsampled)[hvg.fca_subsampled,]), rank=50)
Warning in asMethod(object) :
sparse->dense coercion: allocating vector of size 2.1 GiB
reducedDims(fca_subsampled) <- list(PCA=fca_subsampled.pca$x)
fca_subsampled <- RunMCA(fca_subsampled, nmcs=50, features=hvg.fca_subsampled)
Warning in asMethod(object) :
sparse->dense coercion: allocating vector of size 18.2 GiB
Computing Fuzzy Matrix
14.694 sec elapsed
Computing SVD
31.578 sec elapsed
Computing Coordinates
3.147 sec elapsed
Check the performance of a Cell ID classifier on the dataset it came from (FCA)
new_signatures <- GetGroupGeneSet(fca_subsampled, group.by="celltype", n.features=100)
creating ranking
| | 0 % ~calculating
|++ | 3 % ~00s
|++++ | 6 % ~00s
|+++++ | 9 % ~00s
|+++++++ | 12% ~00s
|++++++++ | 15% ~00s
|++++++++++ | 18% ~00s
|+++++++++++ | 21% ~00s
|+++++++++++++ | 24% ~00s
|++++++++++++++ | 27% ~00s
|++++++++++++++++ | 30% ~00s
|+++++++++++++++++ | 33% ~00s
|+++++++++++++++++++ | 36% ~00s
|++++++++++++++++++++ | 39% ~00s
|++++++++++++++++++++++ | 42% ~00s
|+++++++++++++++++++++++ | 45% ~00s
|+++++++++++++++++++++++++ | 48% ~00s
|++++++++++++++++++++++++++ | 52% ~00s
|++++++++++++++++++++++++++++ | 55% ~00s
|+++++++++++++++++++++++++++++ | 58% ~00s
|+++++++++++++++++++++++++++++++ | 61% ~00s
|++++++++++++++++++++++++++++++++ | 64% ~00s
|++++++++++++++++++++++++++++++++++ | 67% ~00s
|+++++++++++++++++++++++++++++++++++ | 70% ~00s
|+++++++++++++++++++++++++++++++++++++ | 73% ~00s
|++++++++++++++++++++++++++++++++++++++ | 76% ~00s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~00s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~00s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~00s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s
new_cellid_enrichments <- RunCellHGT(fca_subsampled, pathways = new_signatures, features=hvg.fca_subsampled, dims = 1:50, n.features=100)
calculating distance
ranking genes